List of used packages:
library(limma)
library(dplyr)
library(readxl)
library(readr)
library(plyr)
library(cowplot)
library(impute)
library(ggbiplot)
library(gplots)
library(reactable)
library(heatmaply)
library(EnhancedVolcano)
library(ggVennDiagram)
We had two softs for mass-spectrometry results analysis: Peaks Xpro and MaxQuant. Firstly, we compared identified proteins using these two softs.
maxq <- read_xlsx("raw_proteomics_MaxQuant_filtered.xlsx")
peaks <- read_csv("data_Peaks.csv")
id_list <- list(peaks$Gene_id, maxq$gene_id)
ggVennDiagram(id_list, category.names = c("Peaks", "MaxQuant"), label_alpha = 0) +
scale_fill_gradient(low = "#0a9278", high = "#f57002") +
labs(title = "Protein identification") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Read raw data obtained from "Peaks" software
data <- read.csv("data_Peaks.csv", stringsAsFactors = F)
# Make tidy data
data$Accession_id <- unlist(lapply(data$Accession, function(x) unlist(strsplit(x, "\\|")[1])[1]))
data$temp <- unlist(lapply(data$Accession, function(x) unlist(strsplit(x, "\\|")[1])[2]))
data$Protein_name <- unlist(lapply(data$temp, function(x) unlist(strsplit(x, "_")[1])[1]))
data <- data[, -111]
# Read factor table
data_factors <- read_xlsx("factor_Peaks.xlsx")
data_factors$Health <- as.factor(data_factors$Health)
data_factors$Differentiation <- as.factor(data_factors$Differentiation)
data_factors$Series <- as.factor(data_factors$Series)
colnames(data)[39:71] <- data_factors$sample
reactable(data[, c(111, 39:71)])
reactable(data_factors)
t_data <- t(data[, 39:71])
colnames(t_data) <- data$Protein_name
## Remove columns where the number of NA is greater than half of observations
cond_1 <- colSums(is.na(t_data)) <= nrow(t_data) / 2
t_data_mindrop <- t_data[, cond_1]
sum(colSums(is.na(t_data)) >= nrow(t_data) / 2)
## [1] 1316
## Remove columns where the number of NA is greater than 2
cond_2 <- colSums(is.na(t_data)) < 2
t_data_maxdrop <- t_data[, cond_2]
sum(colSums(is.na(t_data)) > 2)
## [1] 2344
## Imputation of NA
data_maxdrop <- t(impute.knn(t_data_maxdrop, k = 10)$data)
sum(is.na(data_maxdrop))
## [1] 0
data_mindrop <- t(impute.knn(t_data_mindrop, k = 10)$data)
sum(is.na(data_mindrop))
## [1] 0
Log2 transformation and quantile transformation were used.
# MAX DROP (Drop NA if > 2)
data_norm_log_max <- log2(data_maxdrop + 1)
data_norm_quantile_max <- normalizeQuantiles(data_norm_log_max)
boxplot(data_norm_quantile_max,col = "#0a9278",
border = "black",
main = "Log2 + Quantile normalization",
ylab = "intensities",
xlab = "samples")
# MIN DROP (Drop NA if > half of observations)
data_norm_log_min <- log2(data_mindrop + 1)
data_norm_quantile_min <- normalizeQuantiles(data_norm_log_min)
boxplot(data_norm_quantile_min, col = "#0a9278",
border = "black",
main = "Log2 + Quantile normalization",
ylab = "intensities",
xlab = "samples")
We decided to use “MAX drop” for the further analysis.
PCA for “MAX drop”
# Year
data_pca_maxdrop <- prcomp(t(data_norm_quantile_max), center = T, scale. = F)
ggbiplot(data_pca_maxdrop, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "No correction", color = "Year") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# 1 year
data_pca_maxdrop_1 <- prcomp(t(data_norm_quantile_max)[15:33,], center = T, scale. = F)
ggbiplot(data_pca_maxdrop_1, ellipse = TRUE, groups = data_factors$Differentiation[15:33], labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_discrete(name = "Group") +
scale_shape_discrete(name = "Health") +
scale_color_manual(values = c("#0a9278","#f57002")) +
geom_point(aes(color = data_factors$Differentiation[15:33], shape = data_factors$Health[15:33]), size = 2) +
labs(title = "PCA for MAX DROP / 1 year") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# 2 year
data_pca_maxdrop_2 <- prcomp(t(data_norm_quantile_max)[1:14,], center = T, scale. = F)
ggbiplot(data_pca_maxdrop_2, ellipse = TRUE, groups = data_factors$Differentiation[1:14], labels = NULL, var.axes = FALSE, alpha = 0.7) +
geom_point(aes(color = data_factors$Differentiation[1:14], shape = data_factors$Health[1:14]), size = 2) +
scale_color_discrete(name = "Group") +
scale_shape_discrete(name = "Health") +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "PCA for MAX DROP / 2 year") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Health
ggbiplot(data_pca_maxdrop, ellipse = TRUE, groups = data_factors$Health, labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "PCA for MAX DROP / Health") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
# Differentiation
ggbiplot(data_pca_maxdrop, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
scale_color_manual(values = c("#0a9278","#f57002")) +
labs(title = "PCA for MAX DROP / Differentiation") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
X <- model.matrix(~ Series, data = data_factors)
# Build a linear model for each protein
fit <- lmFit(data_norm_quantile_max, design = X, method = "robust", maxit = 1000)
# Empirical Bayes statistics
efit <- eBayes(fit)
topTable(efit, coef = 2)
## logFC AveExpr t P.Value adj.P.Val B
## CYTB -3.673090 24.16232 -28.29261 4.146187e-25 5.021033e-22 46.86073
## PFD3 3.652494 21.58231 19.10300 1.106861e-19 6.702045e-17 34.85261
## S10A6 -4.305366 28.53789 -17.60786 1.360747e-18 5.492882e-16 32.37892
## DX39B 1.802131 24.41763 15.43026 7.280926e-17 1.821569e-14 28.42657
## NF1 -4.427363 24.00246 -15.41342 7.520928e-17 1.821569e-14 28.39367
## SC61G 1.659376 23.77067 15.19854 1.140337e-16 2.301580e-14 27.98333
## CDC42 1.734035 25.25963 15.06821 1.470974e-16 2.544784e-14 27.72482
## RLA1 5.630592 20.95293 14.75319 2.740664e-16 4.148680e-14 27.11742
## TMOD3 1.557038 24.26026 14.24639 7.615708e-16 1.024736e-13 26.09511
## PP14B 2.875091 22.86225 13.81546 1.854394e-15 2.245671e-13 25.19693
num_spots <- nrow(data_norm_quantile_max)
full_list <- topTable(efit, coef = 2, number = num_spots,
sort.by = "none")
p_above <- full_list$adj.P.Val <= 0.05
dif_data_year_wo_correction <- data_norm_quantile_max[p_above, ]
sum(p_above)
## [1] 707
heatmaply(dif_data_year_wo_correction, main = "Control vs Differentiation", fontsize_row = 1, dendrogram = "col", scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(low = "lightseagreen", high = "orangered3", midpoint = 15))
EnhancedVolcano(full_list,
lab = rownames(full_list),
title = "Batch effect connecting with year\nof experiment",
subtitle = NULL,
x = "logFC",
y = "adj.P.Val",
pCutoff = 0.05,
FCcutoff = 0.1,
legend = c("Not significant","Log2FC","Padj","Padj & Log2FC"),
legendPosition = "right",
col = c("lightcyan4","#f57002", "#ee9f02", "#0a9278"))